/*******************************************************************************
* Objective: Create IPUMS education attainment data
*******************************************************************************/

* Pin the interpreter to Stata 16 behaviour
version 16

* Establish the working directory (read from the global macro workdirectory)
cd "$workdirectory"

* Session settings: close any log left open, disable output paging, wipe memory
capture log close
set more off
clear all

********************************************************************************
**# Part 1: Educational Attainment Data
********************************************************************************

* Import each regional IPUMS attainment extract and park it in its own tempfile.
* Variable names are read from row 11 of the CSV and data rows start at row 12.
local regions lac asia africa fiji png
foreach reg of local regions {
	import delimited raw_datasets/IPUMS_education/`reg'_a65_edattain.csv, varnames(11) rowrange(12) clear
	tempfile part_`reg'
	save `part_`reg'', replace
}

* Stack the extracts in list order: the first region is loaded as the base
* dataset and the remaining regions are appended beneath it.
clear all
gettoken first_region other_regions : regions
use `part_`first_region'', clear
foreach reg of local other_regions {
	append using `part_`reg''
}

* Retain only the weighted-count rows; v1 is not needed afterwards
drop if v1 != "Weighted N"
drop v1

* Build the combined Sudan 2008 record: every count column of that row becomes
* the sum of observations 183 and 184.
* NOTE(review): 183 and 184 are hard-coded positions in the appended, filtered
* data - presumably the South Sudan and Sudan 2008 rows. Confirm whenever the
* raw extracts change, because a shifted row order silently alters the sums.
local count_vars niunotinuniverse lessthanprimarycompleted primarycompleted secondarycompleted universitycompleted unknown rowtotal
foreach cv of local count_vars {
	replace `cv' = `cv'[183] + `cv'[184] if v2 == "729200801 Sudan 2008"
}

* Remove the column-total rows, then give the sample identifier its final name
drop if v2 == "COL TOTAL"
rename v2 census

* Country: first run of letters in the identifier, plus a second run when one
* follows. Names of three or more words are therefore cut to their first two.
generate country = regexs(0) if regexm(census, "(([a-zA-Z]+)[ ]*([a-zA-Z]+))")

* Year: the four digits that close the identifier, converted to numeric
generate year = regexs(0) if regexm(census, "[0-9][0-9][0-9][0-9]$")
destring year, replace

* Treat a missing not-in-universe count as zero
replace niunotinuniverse = 0 if niunotinuniverse == .

* Shares (%) of each attainment level, relative to the row total net of the
* not-in-universe count. The denominator is kept as macro text so that it is
* expanded inside every expression and evaluated exactly as written.
local in_universe (rowtotal-niunotinuniverse)
local count_list lessthanprimarycompleted primarycompleted secondarycompleted universitycompleted unknown
local share_list share_less_primary share_primary share_secondary share_university share_unknown
local n_shares : word count `count_list'
forvalues k = 1/`n_shares' {
	local src : word `k' of `count_list'
	local dst : word `k' of `share_list'
	generate `dst' = (`src'/`in_universe')*100
}

* Labels for the share variables
label variable share_less_primary "Share of Less than primary completed (%)"
label variable share_primary      "Share of Primary completed (%)"
label variable share_secondary    "Share of Secondary completed (%)"
label variable share_university   "Share of University completed (%)"
label variable share_unknown      "Share of Unknown (%)"

* Align country coverage and spelling with the main dataset

* Countries removed from this dataset
drop if inlist(country, "Armenia", "Palestine", "Saint Lucia", "Israel", "Kyrgyz Republic", "South Sudan")

* Country names rewritten to the target spellings
replace country = "Venezuela, RB"      if country == "Venezuela"
replace country = "Iran, Islamic Rep." if country == "Iran"
replace country = "Egypt, Arab Rep."   if country == "Egypt"
replace country = "Lao PDR"            if country == "Laos"
replace country = "Papua New Guinea"   if country == "Papua New"

* Source type of each sample: census unless one of the exceptions below applies
generate type_census = "census"

* Intercensal surveys, identified by country and year
replace type_census = "Intercensal Survey" if country == "Cambodia"  & inlist(year, 2004, 2013)
replace type_census = "Intercensal Survey" if country == "Indonesia" & inlist(year, 1976, 1985, 1995, 2005)

* Employment and household surveys
replace type_census = "Employment Survey"  if country == "India"
replace type_census = "Household Survey"   if country == "Nigeria"
replace type_census = "Household Survey"   if country == "Uruguay" & year == 2006

* Variable labels for the raw counts and the source-type indicator
label variable niunotinuniverse         "Total Not in Universe"
label variable lessthanprimarycompleted "Total Less than primary completed"
label variable primarycompleted         "Total Primary completed"
label variable secondarycompleted       "Total Secondary completed"
label variable universitycompleted      "Total University completed"
label variable unknown                  "Total Unknown"
label variable rowtotal                 "Total People"
label variable type_census              "Indicates if census or type of survey"

* Write the attainment dataset to disk, overwriting any earlier copy
save "processed_datasets/dataset_education_a65_ipums_19032022", replace

********************************************************************************
**# Part 2: Years of Education Data
********************************************************************************

* Load the years-of-schooling workbook; the first row supplies variable names.
* The path uses forward slashes, which Stata accepts on every platform and
* which match the other paths in this file; backslashes are Windows-only.
import excel "raw_datasets/IPUMS_education/yrschool_file.xlsx", sheet("Sheet1") firstrow clear

* Country and year are parsed from census with the same patterns as in Part 1
* (assumes the sheet has a census column - TODO confirm against the workbook).
generate country = regexs(0) if regexm(census, "(([a-zA-Z]+)[ ]*([a-zA-Z]+))")
generate year = regexs(0) if regexm(census, "[0-9][0-9][0-9][0-9]$")
* NOTE(review): year is left as a string here, whereas Part 1 destrings it.
* Confirm which type downstream merges expect before changing it.

* Write the years-of-schooling dataset to disk, overwriting any earlier copy
save "processed_datasets/yrschool.dta", replace
